Because the variables are measured on very different scales, we first normalize each numeric variable to the range 0-1. To see the distribution of each variable, we use boxplots. The result shows that there are many outliers in area, hoa, rent amount, property tax and fire insurance. Because the dataset is large, we draw a random sample of 100 listings for the star plot, in which each star depicts the features of one sampled listing.
# Read the rental listings from CSV with readr; column types are left for
# the parser to infer (it reports its guesses in the message that follows).
library(readr)
house_data <- read_csv(file = "Dataset/Rent_House.csv")
## Rows: 10677 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): floor, animal, furniture
## dbl (8): area, rooms, bathroom, parking spaces, hoa, rent amount, property t...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print a compact per-column summary (type and leading values) of the data.
str(object = house_data)
## spc_tbl_ [10,677 × 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ area : num [1:10677] 700 350 486 318 80 900 540 425 200 350 ...
## $ rooms : num [1:10677] 4 3 8 4 2 3 6 4 4 3 ...
## $ bathroom : num [1:10677] 7 3 4 3 1 4 8 4 3 3 ...
## $ parking spaces: num [1:10677] 8 3 6 0 1 8 3 0 0 3 ...
## $ floor : chr [1:10677] "-" "-" "-" "-" ...
## $ animal : chr [1:10677] "acept" "acept" "acept" "acept" ...
## $ furniture : chr [1:10677] "not furnished" "not furnished" "not furnished" "not furnished" ...
## $ hoa : num [1:10677] 0 0 0 0 875 0 0 0 0 0 ...
## $ rent amount : num [1:10677] 45000 30000 25000 19000 24000 20000 15000 15000 15000 15000 ...
## $ property tax : num [1:10677] 8750 560 2200 384 0 ...
## $ fire insurance: num [1:10677] 677 451 376 338 305 301 267 267 267 267 ...
## - attr(*, "spec")=
## .. cols(
## .. area = col_double(),
## .. rooms = col_double(),
## .. bathroom = col_double(),
## .. `parking spaces` = col_double(),
## .. floor = col_character(),
## .. animal = col_character(),
## .. furniture = col_character(),
## .. hoa = col_double(),
## .. `rent amount` = col_double(),
## .. `property tax` = col_double(),
## .. `fire insurance` = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Min-max scale the numeric columns (positions 1-4 and 8-11) to [0, 1] so
# that variables measured on different scales can share one boxplot axis.
normalized_house_data <- as.data.frame(
  lapply(house_data[, c(1:4, 8:11)], function(x) {
    rng <- range(x, na.rm = TRUE)
    span <- rng[2] - rng[1]
    # A constant column has zero range; map it to 0 rather than NaN (0 / 0).
    if (span == 0) {
      return(x - rng[1])
    }
    (x - rng[1]) / span
  })
)
boxplot(normalized_house_data)

# Fixed seed so the same 100 rows are drawn on every run.
set.seed(100)
random_indices <- sample(nrow(normalized_house_data), 100)

# Scaled numeric columns plus the `animal` flag (column 6 of house_data).
house_data_normalized <- cbind(
  normalized_house_data,
  animal = house_data[["animal"]]
)

# Draw one star per sampled row from the numeric columns only, and label each
# star with that row's `animal` value. The labels must be subset with the same
# indices as the data, and the argument is spelled out in full (`labels`)
# instead of relying on partial matching.
stars(
  normalized_house_data[random_indices, ],
  labels = house_data_normalized$animal[random_indices]
)
To find the relationship between area and rent price, we make a
scatterplot of these two variables. It shows that they are positively
correlated. In addition, we draw a scatterplot matrix of all numeric
variables. It shows the pairwise relationships between the rent amount
and the other variables.
# Dive Deeper into Visualization Techniques
# NOTE(review): attach() is discouraged; it is kept here only in case code
# outside this section still relies on bare column names. The calls below
# reference columns explicitly and do not depend on it.
attach(house_data)
# Plots
# Numeric columns (positions 1-4 and 8-11) preceded by the numeric row name.
houses_prices <- data.frame(
  as.numeric(rownames(house_data)),
  house_data[, c(1:4, 8:11)]
)
# Display labels in original column order; the fifth entry ("floors") refers
# to the character `floor` column, which is not part of the numeric plots.
labs.diagonal <- c(
  "Area", "Rooms", "Number of Bathrooms", "Parking Space", "floors",
  "HOA", "Rent Price", "Property Tax", "fire insurance"
)
# Rent against area. A single plotting symbol is used: a length-2 `pch` is
# recycled across rows and would alternate symbols without any meaning.
plot(
  house_data$area, house_data$`rent amount`,
  xlab = "Area", ylab = "Rent Price", pch = 16
)
# Scatterplot matrix of the eight numeric columns; drop the "floors" label so
# the labels line up one-to-one with the plotted columns.
pairs(house_data[, c(1:4, 8:11)], labels = labs.diagonal[-5])
These two figures show the effect of animal and furniture on the
other variables. The effects do not appear to be significant.
library(GGally)
## 载入需要的程辑包:ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Scatterplot matrices of the numeric columns (positions 1-4 and 8-11),
# coloured first by the `animal` flag and then by the `furniture` flag.
# GGally is already loaded before this point, so it is not loaded again here.
ggscatmat(house_data, columns = c(1:4, 8:11), color = "animal")
ggscatmat(house_data, columns = c(1:4, 8:11), color = "furniture")
# load packages
library(lattice)
library(ggplot2)
library(ggridges)
library(ggvis)
##
## 载入程辑包:'ggvis'
## The following object is masked from 'package:ggplot2':
##
## resolution
library(ggthemes)
library(cowplot)
##
## 载入程辑包:'cowplot'
## The following object is masked from 'package:ggthemes':
##
## theme_map
library(gapminder)
## Warning: 程辑包'gapminder'是用R版本4.1.2 来建造的
library(gganimate)
## Warning: 程辑包'gganimate'是用R版本4.1.2 来建造的
## No renderer backend detected. gganimate will default to writing frames to separate files
## Consider installing:
## - the `gifski` package for gif output
## - the `av` package for video output
## and restarting the R session
##
## 载入程辑包:'gganimate'
## The following object is masked from 'package:ggvis':
##
## view_static
library(dplyr)
##
## 载入程辑包:'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## Warning: 程辑包'tidyverse'是用R版本4.1.2 来建造的
## Warning: 程辑包'tibble'是用R版本4.1.2 来建造的
## Warning: 程辑包'forcats'是用R版本4.1.2 来建造的
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ ggvis::resolution() masks ggplot2::resolution()
## ✖ lubridate::stamp() masks cowplot::stamp()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(grid)
library(gridExtra)
##
## 载入程辑包:'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(RColorBrewer)
## Warning: 程辑包'RColorBrewer'是用R版本4.1.2 来建造的
We use ggplot to show the relationship of area and property tax with rent amount. Both are positively correlated with rent amount.
# ggplot
# Every plot below passes `house_data` to ggplot() and names its columns
# inside aes(), so no attach() is needed (re-attaching the same data frame
# only produced masking messages).

# Rent against area.
ggplot(house_data, aes(x = area, y = `rent amount`)) +
  geom_point(colour = "steelblue", shape = 3) +
  labs(
    x = "squares of area",
    y = "Rent prices in Brazilian Reals",
    title = "House Rent Prices"
  )

# Rent against property tax.
ggplot(house_data, aes(x = `property tax`, y = `rent amount`)) +
  geom_point(colour = "red", shape = 3) +
  labs(
    x = "Property Tax in Brazilian Reals",
    y = "Rent prices in Brazilian Reals",
    title = "House Rent Prices"
  )
Bar charts and a histogram are used to analyze the distributions of rooms, bathrooms and parking spaces.
# bar chart
# Counts of listings per number of rooms and per number of bathrooms
# ("stack" is geom_bar()'s default position, so it is not repeated).
ggplot(house_data, aes(rooms)) + geom_bar()
ggplot(house_data, aes(bathroom)) + geom_bar()
# Room counts, one panel per number of bathrooms.
ggplot(house_data, aes(rooms)) +
  facet_grid(. ~ bathroom) +
  geom_bar(position = "dodge")

# histogram
# `parking spaces` is a whole-number count, so use one bin per value instead
# of the default 30 bins (which ggplot2 reports as a poor default).
ggplot(house_data, aes(`parking spaces`)) +
  geom_histogram(binwidth = 1)
ggplot(house_data, aes(`parking spaces`)) +
  geom_histogram(aes(fill = after_stat(count)), binwidth = 1)
Next, we use regression smoothers to analyze the bivariate relationships.
# Rent against area with a linear-model fit.
ggplot(house_data, aes(x = area, y = `rent amount`)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
# Rent against rooms with the default smoother.
ggplot(house_data, aes(x = rooms, y = `rent amount`)) +
  geom_point() +
  stat_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Rent against area with the default smoother.
ggplot(house_data, aes(x = area, y = `rent amount`)) +
  geom_point() +
  stat_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Rent against rooms with a linear-model fit.
ggplot(house_data, aes(x = rooms, y = `rent amount`)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
# violin plot
# A violin needs a discrete x to form groups; with a numeric x and no `group`
# all rows collapse into a single violin. `area` is binned into five
# equal-count intervals, and the room/bathroom counts are treated as factors.
ggplot(house_data, aes(x = cut_number(area, n = 5), y = `rent amount`)) +
  geom_violin() +
  labs(x = "area (five equal-count bins)")
ggplot(house_data, aes(x = factor(rooms), y = `rent amount`)) +
  geom_violin() +
  labs(x = "rooms")
ggplot(house_data, aes(x = factor(bathroom), y = `rent amount`)) +
  geom_violin() +
  labs(x = "bathroom")
# box plot
# geom_boxplot() needs a discrete x (or an explicit group) to draw one box per
# level; a continuous x triggers ggplot2's "Continuous x aesthetic" warning
# and yields a single box. `area` is binned into five equal-count intervals,
# and the room/bathroom counts are treated as factors.

# Horizontal boxes.
ggplot(house_data, aes(x = cut_number(area, n = 5), y = `rent amount`)) +
  geom_boxplot() +
  labs(x = "area (five equal-count bins)") +
  coord_flip()
ggplot(house_data, aes(x = factor(rooms), y = `rent amount`)) +
  geom_boxplot() +
  labs(x = "rooms") +
  coord_flip()
ggplot(house_data, aes(x = factor(bathroom), y = `rent amount`)) +
  geom_boxplot() +
  labs(x = "bathroom") +
  coord_flip()

# Vertical boxes.
ggplot(house_data, aes(x = cut_number(area, n = 5), y = `rent amount`)) +
  geom_boxplot() +
  labs(x = "area (five equal-count bins)")
ggplot(house_data, aes(x = factor(rooms), y = `rent amount`)) +
  geom_boxplot() +
  labs(x = "rooms")
ggplot(house_data, aes(x = factor(bathroom), y = `rent amount`)) +
  geom_boxplot() +
  labs(x = "bathroom")
# density plot and ggridges
# Overall density of area.
ggplot(house_data, aes(x = area)) + geom_density()
# Density of area per number of rooms. `rooms` is numeric, so it is converted
# to a factor; otherwise the density stat cannot infer groups and drops the
# fill/colour aesthetics.
ggplot(house_data, aes(x = area, fill = factor(rooms), color = factor(rooms))) +
  geom_density() +
  labs(fill = "rooms", color = "rooms")
# Same plot with each curve rescaled to a maximum of 1; after_stat() replaces
# the deprecated `..scaled..` notation.
ggplot(house_data, aes(x = area, fill = factor(rooms), color = factor(rooms))) +
  geom_density(aes(y = after_stat(scaled)), alpha = 0.3) +
  labs(fill = "rooms", color = "rooms")
# Ridgeline densities of area, one ridge per furniture category.
ggplot(house_data, aes(x = area, y = furniture)) +
  geom_density_ridges(aes(fill = furniture))
## Picking joint bandwidth of 16.4
# Overall density of the rent amount.
ggplot(house_data, aes(x = `rent amount`)) + geom_density()
# Ridgeline densities of the rent amount, one ridge per furniture category.
ggplot(house_data, aes(x = `rent amount`, y = furniture)) +
  geom_density_ridges(aes(fill = furniture))
## Picking joint bandwidth of 472
# hexbin
# Hexagonal binning of rent amount against area.
ggplot(
  data = house_data,
  mapping = aes(x = area, y = `rent amount`)
) +
  geom_hex()
# with ggthemes (see also ggsci, ggthemr)
# Base plot reused for the theme gallery below: rent against area, points
# coloured by furniture status, with the default smoother. The axis labels and
# title describe the housing data (they previously referred to diamonds).
lastplot <- ggplot(house_data, aes(x = area, y = `rent amount`)) +
  geom_point(aes(color = furniture)) +
  stat_smooth() +
  labs(
    x = "Area",
    y = "Rent prices in Brazilian Reals",
    title = "House Rent Prices"
  )
lastplot + theme_bw()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
lastplot + theme_cowplot()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
lastplot + theme_dark()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
lastplot + theme_economist()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
lastplot + theme_fivethirtyeight()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
lastplot + theme_tufte()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
lastplot + theme_wsj()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'